home *** CD-ROM | disk | FTP | other *** search
/ Cream of the Crop 22 / Cream of the Crop 22.iso / os2 / htm2txt1.zip / HTM2TXT.CMD < prev    next >
OS/2 REXX Batch file  |  1996-09-21  |  27KB  |  646 lines

  1. /* ----------------------------------------------------------------- */
  2.    lastmod='1996-09-21'
  3. /* 96-09-21 or rework follow href= error                     */
  4. /* 96-09-11 or <a ...> tag    due to etraas@te.xs4all.nl     */
  5. /* 96-08-21 or rework <table> tag error                      */
  6. /* 96-08-03 or rework <table> tag due to "Kirchner Soft"     */
  7. /* 96-08-02 or rework <table> tag due to pinkas@en.com       */
  8. /* 96-07-16 or follow href=                                  */
  9. /* 96-06-29 or rework                                        */
  10. /* 96-04-17 or try to support <table>                        */
  11. /* 96-01-15 or reworked                                      */
  12. /* 95-07-10 or decode HTML files                             */
  13. /* --------------------------------------------------------- */
  14. /* call:    htm2txt infile  [l nn] [p nn] [e editor] [u]     */
  15. /* output:  infile-name.TXT                                  */
  16. /*                                                           */
  17. /* recognised tags:                                          */
  18. /*                                                           */
  19. /*   all tags as supported by ibm webex 1.1b                 */
  20. /*                                                           */
  21. /* special tags:                                             */
  22. /*                                                           */
  23. /*   <trace>                            trace '?i'           */
  24. /*   <exit>                             exit immediately     */
  25. /*                                                           */
  26. /* recognised substitute variables see variable 'consts'     */
  27. /*            tab-char '    ' will be ignored                   */
  28. /*                                                           */
  29. /* notes:                                                    */
  30. /*                                                           */
  31. /*   all tags are converted as 'best fit'.                   */
  32. /*   the image a browser produces will not be met.           */
  33. /*                                                           */
  34. /* --------------------------------------------------------- */
  35. /* variables to be customized                                */
  36. /*                                                           */
  37.    linemax=72                /* maximum line length          */
  38.    pixlbyt= 6                /* pixels per byte for tables   */
  39.    editor ='e'               /* editor for output file       */
  40. /* --------------------------------------------------------- */
  41. /* constants as known of today: blank-separated pairs        */
  42. /* "entity replacement"; a replacement 'hh'x is a hex code   */
  43.    consts=       "&space '20'x"
  44.    consts=consts "&nbsp    '20'x"  /* NOTE(review): name lost in listing, guessed */
  45.    consts=consts "&szlig   ß"
  46.    consts=consts "&auml    ä"
  47.    consts=consts "&Auml    Ä"
  48.    consts=consts "&ouml    ö"
  49.    consts=consts "&Ouml    Ö"
  50.    consts=consts "&uuml    ü"
  51.    consts=consts "&Uuml    Ü"
  52.    consts=consts "&aacute  á"
  53.    consts=consts "&eacute  é"
  54.    consts=consts "&iacute  í"
  55.    consts=consts "&#32     '20'x"  /* NOTE(review): name lost in listing, guessed */
  56.    consts=consts "&#58     :"
  57.    consts=consts "&#60     <"
  58.    consts=consts "&#62     >"
  59.    consts=consts "&#91     ["
  60.    consts=consts "&#93     ]"
  61.    consts=consts "&#146    '"   /* Æ             */
  62.    consts=consts '&quot    "'
  63.    consts=consts '&#145    "'   /* æ             */
  64.    consts=consts '&#147    "'   /* ô             */
  65.    consts=consts '&#148    "'   /* ö             */
  66.    consts=consts "&#160    'a0'x"
  67.    consts=consts "&#176    °"   /* ° EBCDIC !!!  */
  68.    consts=consts "&#185    '"   /* ╣             */
  69.    consts=consts "&mdash   -"
  70.    consts=consts "&lt      <"
  71.    consts=consts "&gt      >"
  72.    consts=consts "&amp     '00'x"  /* '00'x is turned back into '&' at the end of subs */
  73.    consts=consts "&copy    '20'x"
  74. /* --------------------------------------------------------- */
  75. /* check input parameters                                    */
  76. /*                                                           */
  77.    parse upper arg ifiname options
  78.    if ifiname='' then do; say 'usage: htm2txt infile [l nn] [p nn] [e editor] [u]'; exit 4; end
  79.    dot=lastpos('.',ifiname)  /* an extension counts only after the last path separator */
  80.    if dot<=max(lastpos('\',ifiname),lastpos('/',ifiname),lastpos(':',ifiname)) then dot=0
  81.    if dot=0 then do; fn=ifiname; ifiname=ifiname'.HTM'; end; else fn=left(ifiname,dot-1)
  82.    ofiname = fn'.TXT'
  83. /* --------------------------------------------------------- */
  84. /* check for options: L nn, P nn, E editor, U - in any order */
  85. /* L/P values that are not usable whole numbers are ignored  */
  86.    swi_url=0
  87.    do while options \= ''
  88.      parse upper var options opt val options
  89.      select
  90.        when opt='L' & datatype(val,'W') & val>=0 then linemax=val
  91.        when opt='P' & datatype(val,'W') & val>0  then pixlbyt=val  /* used as a divisor */
  92.        when opt='E' then editor =val
  93.        when opt='U' then do
  94.                          swi_url=1
  95.                          options=val options   /* U takes no value: push the word back */
  96.                          end
  97.        otherwise nop
  98.        end
  99.      end
  100. /* --------------------------------------------------------- */
  101. /* activate debug facilities                                 */
  102. /*                                                           */
  103. /*
  104.    signal on syntax
  105.    signal on error
  106.    signal on failure
  107.    signal on halt
  108. */
  109. /* --------------------------------------------------------- */
  110. /* some global controls                                      */
  111. /*                                                           */
  112.    hrf.1=ifiname             /* list of files to convert, first entry = start file */
  113.    hrf.0=1                   /* entry count; grows when process_file adds href targets */
  114.                              /* remove a result file left over from an earlier run */
  115.    'erase' ofiname '2>NUL'
  116.    headline='Extracted from' ifiname',' date()',' left(time(),5)
  117.    call lineout ofiname,'HTM2TXT v.' lastmod
  118.    call lineout ofiname,' '
  119.    call lineout ofiname,headline
  120.    call lineout ofiname,' '
  121.    nexthrf=1
  122.    do while nexthrf<=hrf.0   /* hrf.0 is re-read on every pass */
  123.      call process_file hrf.nexthrf
  124.      nexthrf=nexthrf+1
  125.      end
  126.    call lineout ofiname      /* no data argument: closes the result file */
  127.    say
  128.    if editor='' then nop; else 'start /F' editor ofiname /* <=== edit result */
  129. /* --------------------------------------------------------- */
  130.    exit 0
  131. /* --------------------------------------------------------- */
  132. /* process a file                                            */
  133. /*                                                           */
  134. process_file: parse arg ifiname
  135. /* --------------------------------------------------------- */
  136. /* read infile: all records are joined into the string ifi   */
  137. /*                                                           */
  138.    nl ='0d'x                 /* appended after every record  */
  139.    ifi=''
  140.    say
  141.    say 'reading' ifiname
  142.    call stream ifiname,'c','close'
  143.    records=0
  144.    do while chars(ifiname)>1 /* stops when at most one character is left */
  145.      ifi=ifi||translate(linein(ifiname),' ','09'x)||nl   /* tabs become blanks */
  146.      records=records+1
  147.      end
  148.    call stream ifiname,'c','close'
  149.    say records 'records read from' ifiname
  150. /* --------------------------------------------------------- */
  151. /* format outfile lines                                      */
  152. /*                                                           */
  153.    ofi.0=0                   /* out file controls            */
  154.    parse value '' with dlspaces lispaces outtext
  155.                              /* ^ <DL>-spaces, <LI>-spaces and initial text are empty */
  156.    indents   =0              /* number of indents            */
  157.    blanklines=0              /* number of blank lines        */
  158.    linelen   =linemax        /* max. linelength              */
  159.  
  160.                              /* switches, all off:           */
  161.    parse value 0 0 0 0 0 0 with swi_pre swi_tbl swi_lst swi_cnt swi_cat swi_trc
  162.                              /* swi_pre  PRE                 */
  163.                              /* swi_tbl  table definition    */
  164.                              /* swi_lst  list  definition    */
  165.                              /* swi_cnt  center text         */
  166.                              /* swi_cat  concatenate         */
  167.                              /* swi_trc  trace               */
  168.  
  169. /* --------------------------------------------------------- */
  170. /* scan input stream                                         */
  171. /*                                                           */
  172.    call charout ,'processing token '
  173.    text=''
  174.  
  175.    do count=1 while length(ifi)>0
  176.      /* one pass = the text in front of the next '<' plus the tag up to '>' */
  177.      call charout ,format(count,5) copies('08'x,6)
  178.  
  179.      if swi_trc then trace 'i'
  180.  
  181.                                /* check next line            */
  182.      parse var ifi parttext '<' tag '>' ifi
  183.                                /* process text               */
  184.      select
  185.        when swi_pre            then call process_preformatted
  186.        when strip(parttext)=nl then nop
  187.        otherwise               do
  188.                                  do while pos(nl,parttext)>0
  189.                                    parse var parttext a (nl) b
  190.                                    parttext=strip(a) strip(b)
  191.                                    end
  192.                                  if swi_cat then text=text||parttext
  193.                                  else do
  194.                                    if text='' then text=     parttext
  195.                                               else text=text parttext
  196.                                    end
  197.                                  end
  198.        end
  199.                                /* process tag                */
  200.      tag=translate(tag,' ',nl)
  201.      if left(tag,1)='!' then tag='!' substr(tag,2)
  202.      parse var tag tag options
  203.      tag=translate(tag)
  204.      swi_cat=0
  205.      /* tag = upper-cased tag name, options = the rest of the tag */
  206.      select
  207.        when tag='TRACE'    then  swi_trc=1
  208.        when tag='EXIT'     then  signal finish
  209.  
  210.        when tag='!'        then  call out '***' options '***'
  211.        when tag='FONT'     then  swi_cat=1
  212.        when tag='UL',
  213.        |    tag='OL',
  214.        |    tag='DL',
  215.        |    tag='DIR',
  216.        |    tag='MENU',
  217.                            then do
  218.                                  call out text
  219.                                  call out ' '
  220.                                  lispaces=' * '
  221.                                  indents=indents+1
  222.                                  swi_lst=1
  223.                                  end
  224.        when tag='LI'       then  call out text
  225.        when tag='DT'       then do
  226.                                  call out text
  227.                                  lispaces=' * '
  228.                                  if indents>0 then indents=indents-1
  229.                                  end
  230.        when tag='DD'       then do
  231.                                  call out text
  232.                                  lispaces='    '
  233.                                  indents=indents+1
  234.                                  end
  235.        when tag='/UL',
  236.        |    tag='/OL',
  237.        |    tag='/DL',
  238.        |    tag='/DIR',
  239.        |    tag='/MENU',
  240.                            then do
  241.                                  call out text
  242.                                  lispaces=''
  243.                                  if indents>0 then indents=indents-1
  244.                                  call out ' '
  245.                                  swi_lst=0
  246.                                  end
  247.        when tag='CENTER',
  248.        |    tag='CENTRE',
  249.                            then  swi_cnt=1
  250.        when tag='/CENTER',
  251.        |    tag='/CENTRE',
  252.                            then do
  253.                                  swi_cnt=0
  254.                                  call out text
  255.                                  end
  256.        when tag='P',
  257.        |    tag='/TITLE',
  258.        |    tag='/CENTER',
  259.        |    tag='/CENTRE',
  260.                            then  call out text
  261.        when tag='/HEAD',
  262.                            then do
  263.                                  call out text
  264.                                  call out ' '
  265.                                  end
  266.        when tag='PRE'      then do
  267.                                  swi_pre=1
  268.                                  linelen=parmval('WIDTH',options)
  269.                                  end
  270.        when tag='/PRE'     then do
  271.                                  swi_pre=0
  272.                                  linelen=linemax
  273.                                  end
  274.  
  275.        when tag='HR'       then call out copies('-',linelen)
  276.  
  277.        when tag='H1',
  278.        |    tag='H2',
  279.        |    tag='H3',
  280.        |    tag='H4',
  281.        |    tag='/H1',
  282.        |    tag='/H2',
  283.        |    tag='/H3',
  284.        |    tag='/H4',
  285.        |    tag='/CAPTION',
  286.                            then do
  287.                                  call out text
  288.                                  call out ' '
  289.                                  end
  290.        when tag='A'        then do
  291.                                  parse upper var options 'HREF' . '"' hrefid '"'
  292.                                  nogo= pos('#',hrefid)>0
  293.                                  srefid=''
  294.                                  if swi_url,
  295.                                  &  \nogo then do
  296.                                    srefid=hrefid
  297.                                    end
  298.                                  parse var hrefid z '.' fext
  299.                                  nogo=nogo|(left(fext,3)\='HTM')
  300.                                  parse var hrefid z 'FILE:' hrefid
  301.                                  if hrefid='' then hrefid=z
  302.                                  nogo=nogo|(strip(hrefid)='')
  303.                                  do i=1 to hrf.0
  304.                                    if hrf.i=hrefid then leave
  305.                                    end
  306.                                  if (i>hrf.0)&(\nogo) then do
  307.                                    hrf.0=hrf.0+1; z=hrf.0; hrf.z=hrefid
  308.                                    end
  309.                                  end
  310.        when tag='/A'       then do
  311.                                  if swi_url,
  312.                                  &  srefid\='' then do
  313.                                    text=text '('srefid')'
  314.                                    srefid=''
  315.                                    end
  316.                                  end
  317. /*
  318.        when tag='IMG'      then do
  319.                                  z=parmval('ALT',options)
  320.                                  if z\=0 then do
  321.                                    if swi_tbl then do
  322.                                                    text=z
  323.                                                    call save_table_text
  324.                                                    end
  325.                                    else text=text z
  326.                                    end
  327.                                  end
  328. */
  329.        when tag='TABLE'    then do
  330.                                  call out text
  331.                                  call out ' '
  332.                                  swi_tbl=1
  333.                                  swi_wid=1
  334.                                  tbwid. =0
  335.                                  end
  336.        when tag='TR'       then do
  337.                                  tbcol=0
  338.                                  tbmax=0
  339.                                  drop tbtxt.
  340.                                  end
  341.        when tag='TD'       then do
  342.                                        /* determine next column    */
  343.  
  344.                                  z=parmval('COLSTART',options)
  345.                                  if z=0 then tbcol=tbcol+1
  346.                                         else tbcol=z
  347.                                  if tbmax<tbcol then tbmax=tbcol
  348.  
  349.                                        /* check for width= tag     */
  350.  
  351.                                  p=parmval('WIDTH',options)
  352.                                  if p>0 then do
  353.                                    select
  354.                                      when right(p,3)='PIX' then do
  355.                                        parse var p n 'PIX' .
  356.                                        tbwid.tbcol=n%pixlbyt
  357.                                        end
  358.                                      when right(p,1)='%'   then do
  359.                                        parse var p n '%' .
  360.                                        tbwid.tbcol=(n*linelen)%100
  361.                                        end
  362.                                      otherwise
  363.                                      if p>linemax then p=linemax
  364.                                      tbwid.tbcol=p
  365.                                      end
  366.                                    end
  367.  
  368.                                        /* set lines/col to 0       */
  369.  
  370.                                  tblin.tbcol=0
  371.                                  end
  372.  
  373.        when tag='/TD'      then do
  374.                                  if swi_tbl then call save_table_text
  375.                                  end
  376.  
  377.        when tag='/TR'      then do
  378.                                  if swi_tbl then do
  379.  
  380.                                    /* col-width already done ?     */
  381.  
  382.                                    if swi_wid then do
  383.                                      swi_wid=0
  384.  
  385.                                          /* check predefined col-width */
  386.  
  387.                                      colwi=0
  388.                                      do i=1 to tbmax
  389.                                        colwi=colwi+tbwid.i
  390.                                        end
  391.                                      linelen=linemax-colwi
  392.                                      if linelen<=0 then linelen=linemax
  393.  
  394.                                          /* set col-width if not set   */
  395.  
  396.                                      do i=1 to tbmax
  397.                                        if tbwid.i>0 then iterate
  398.                                        tbwid.i=linelen%tbmax
  399.                                        end
  400.                                      linelen=linemax
  401.  
  402.                                          /* check sum colwid exceeds   */
  403.                                          /* -> shrink all columns      */
  404.                                      sum_col=0
  405.                                      do i=1 to tbmax
  406.                                        sum_col=sum_col+tbwid.i
  407.                                        end
  408.                                      if sum_col>linemax then do
  409.                                        ratio=linemax/sum_col   /* < 1 here */
  410.                                        do i=1 to tbmax
  411.                                          tbwid.i=trunc(tbwid.i*ratio)
  412.                                          end
  413.                                        end
  414.                                      end
  415.  
  416.                                        /* get max nr. lines in row   */
  417.  
  418.                                    lnmax=1
  419.                                    do i=1 to tbmax
  420.                                      if lnmax<tblin.i then lnmax=tblin.i
  421.                                      end
  422.  
  423.                                        /* fill uninitlzd variables   */
  424.  
  425.                                    do y=1 to lnmax
  426.                                      do k=1 to tbmax
  427.                                        tbtxt.k.y=subs(tbtxt.k.y)
  428.                                        if left(tbtxt.k.y,6)\='TBTXT.' then iterate
  429.                                        if k=1 then tbtxt.k.y='_'
  430.                                               else tbtxt.k.y=''
  431.                                        end
  432.                                      end
  433.  
  434.                                        /* scan all lines all cols    */
  435.  
  436.                                    do y=1 to lnmax
  437.                                      anytxt=0
  438.                                      do k=1 to tbmax
  439.                                        if strip(tbtxt.k.y)='' then iterate
  440.                                        anytxt=1
  441.                                        leave
  442.                                        end
  443.  
  444.                                      do while anytxt
  445.                                          anytxt=0
  446.                                        do k=1 to tbmax
  447.  
  448.                                          /* check length fits          */
  449.  
  450.                                          if length(tbtxt.k.y)>tbwid.k ,
  451.                                          &  tbwid.k>0 then do
  452.                                            z=lastpos(' ',tbtxt.k.y,tbwid.k)
  453.                                            if z=0 then do  /* give up    */
  454.                                                          otext=tbtxt.k.y
  455.                                                          tbtxt.k.y=''
  456.                                                          end
  457.                                                   else do  /* split text */
  458.                                                          otext=left(tbtxt.k.y,z)
  459.                                                          tbtxt.k.y=substr(tbtxt.k.y,z+1)
  460.                                                          anytxt=1
  461.                                                          end
  462.                                            end
  463.                                          else do
  464.                                            otext=tbtxt.k.y
  465.                                            tbtxt.k.y=''
  466.                                            end
  467.                                          if strip(tbtxt.1.y)='' then tbtxt.1.y='_'
  468.                                          /* '_' filler only once col 1 is used up */
  469.                                          /*  build output line       */
  470.  
  471.                                          text=text left(otext,tbwid.k)
  472.                                          end
  473.  
  474.                                          /* all cols processed         */
  475.  
  476.                                        call out_table_text
  477.                                        end
  478.                                      end
  479.                                    end
  480.                                  end
  481.  
  482.        when tag='/TABLE'   then do
  483.                                  blanklines=0
  484.                                  call out ' '
  485.                                  swi_tbl=0
  486.                                  end
  487.        when tag='BR'       then do
  488.                                  if swi_lst  then call out text
  489.                                  if swi_tbl ,
  490.                                  & (tbmax>1) then call save_table_text
  491.                                              else call out text
  492.                                  end
  493.        otherwise nop
  494.        end
  495.                                /* all finished               */
  496.      end
  497. /* --------------------------------------------------------- */
  498. /* write outfile                                             */
  499. /*                                                           */
  500. finish:
  501.    /* append the lines collected in ofi. to the result file ofiname */
  502.    say
  503.    i=0
  504.    do ofi.0
  505.      i=i+1; call lineout ofiname,ofi.i
  506.      end
  507.    return
  508. /* ========================================================= */
  509. /* --------------------------------------------------------- */
  510. /* save table-text                                           */
  511. /*                                                           */
  512.    save_table_text:
  513.      /* store the pending text as a further line of table column tbcol */
  514.      if strip(text)\='' then do
  515.        z=tblin.tbcol+1
  516.        tblin.tbcol=z           /* lines stored for this column so far */
  517.        tbtxt.tbcol.z=text
  518.        text=''
  519.        end
  520.      return
  521. /* --------------------------------------------------------- */
  522. /* out  table-text                                           */
  523. /*                                                           */
  524.    out_table_text:
  525.      /* write one assembled table line; a line that is only the '_' filler is dropped */
  526.      text=strip(text)
  527.      if text\='' then do
  528.        if text\='_' then call o text
  529.        text=''
  530.        end; return
  531. /* --------------------------------------------------------- */
  532. /* process preformatted                                      */
  533. /*                                                           */
  534. process_preformatted:
  535.    /* hand the text in front of the tag to out, one record (nl-delimited) at a time */
  536.    do while parttext\==''
  537.      parse value parttext with outtext (nl) parttext
  538.      call out outtext
  539.      end
  540.    return
  541. /* --------------------------------------------------------- */
  542. /* extract parameter values                                  */
  543. /*                                                           */
  544. parmval: procedure; parse upper arg key,string
  545.    /* returns the first word behind the '=' that follows KEY in STRING, */
  546.    /* upper-cased and without double quotes; returns 0 if KEY is absent */
  547.    at=pos(key,string)
  548.    if at=0 then return 0
  549.    parse value substr(string,at) with '=' val .
  550.    val=strip(translate(val,' ','"'))
  551.    /* both arguments were upper-cased on entry by parse upper arg */
  552.    return translate(val)
  553. /* --------------------------------------------------------- */
  554. /* do output lines                                           */
  555. /*                                                           */
  556. out:
  557.    /* arg(1): text to write. Entities are substituted by subs, lines longer  */
  558.    /* than linelen are wrapped at blanks, each piece is stored through o.    */
  559.    /* Resets the global text. linelen=0 switches wrapping off.               */
  560.    oli=subs(arg(1))
  561.    oll=length(oli)
  562.                      /* do not output more than 1 blank line */
  563.    if oll=0 then do
  564.      if blanklines>0 then                       return
  565.      blanklines=blanklines+1
  566.      end
  567.    wrapped=0                /* 1 once the wrap loop has written something */
  568.    if linelen>0 then do
  569.      do while oll>linelen
  570.        z=lastpos(' ',oli,linelen)
  571.        if z=0 then z=oll    /* no blank to break at: write the word unbroken */
  572.        call o left(oli,z)
  573.        oli=strip(substr(oli,z+1))
  574.        oll=length(oli)
  575.        wrapped=1
  576.        end
  577.      end
  578.    if oll>0 | \wrapped then call o oli  /* no empty extra line after a full wrap */
  579.    if oll>0 |  wrapped then blanklines=0
  580.    text=''; return
  581. o: procedure expose swi_cnt linelen indents dlspaces lispaces ofi.
  582.    parse arg line
  583.    /* store LINE as the next entry of ofi. behind its leading pad:      */
  584.    /* centred text gets half of the free width (never less than none),  */
  585.    /* other text gets the indent blanks plus list and DL spacing        */
  586.    if swi_cnt then pad=copies(' ',max(0,(linelen-length(line))%2))
  587.    else pad=copies(' ',indents)||lispaces||dlspaces
  588.    n=ofi.0+1
  589.    ofi.n=pad||line
  590.    ofi.0=n
  591.    /* ofi.0 always holds the number of stored lines */
  592.    return
  593. /* --------------------------------------------------------- */
  594. /* substitute constants                                      */
  595. /*                                                           */
  596.    subs: procedure expose consts;
  597.      /* arg(1): text. Returns it stripped, tabs turned into blanks and     */
  598.      /* every &name found in consts replaced. consts holds blank-separated */
  599.      /* pairs "name replacement"; a replacement written 'hh'x is a hex     */
  600.      /* code. A ';' directly behind the name is removed with it; any       */
  601.      /* other following character is kept.                                 */
  602.      /* An '&' that starts no known name is replaced by '?'.               */
  603.      l=translate(arg(1),' ','09'x)
  604.      z=pos('&',l)
  605.      if z=0 then return strip(l)
  606.      n=words(consts)
  607.      do while z>0
  608.        head =left(l,z-1)
  609.        token=substr(l,z)
  610.        do i=1 to n by 2
  611.          a=word(consts,i)
  612.          b=length(a)
  613.          if left(token,b)\==a then iterate
  614.          d=word(consts,i+1)
  615.                                /* 'hh'x notation -> the character itself */
  616.          if length(d)>2 & left(d,1)="'" & right(d,2)="'x" then d=x2c(substr(d,2,length(d)-3))
  617.          head=head||d
  618.          token=substr(token,b+1)
  619.          if left(token,1)==';' then token=substr(token,2)
  620.          leave
  621.          end
  622.        if i>n then token='?'substr(token,2)   /* loop ran out: unknown name */
  623.        l=head||token
  624.        z=pos('&',l)
  625.        end
  626.      return strip(translate(l,'&','00'x)) /* '00'x stands for a literal '&' */
  627. /* --------------------------------------------------------- */
  628.    syntax:
  629.      trapname='syntax';  trapline=sigl   /* keep sigl: the signal below resets it */
  630.      signal common_error
  631.    error:
  632.      trapname='error';   trapline=sigl
  633.      signal common_error
  634.    failure:
  635.      trapname='failure'; trapline=sigl
  636.      signal common_error
  637.    halt:
  638.      trapname='halt';    trapline=sigl
  639.      signal common_error
  640.    common_error:
  641.      say 'signal on' trapname 'in' trapline':' strip(sourceline(trapline))
  642.      trace '?i'
  643.      do forever; nop; end
  644.      /* never left: the loop only keeps the interactive trace running */
  645. /* --------------------------------------------------------- */
  646.